################################################################################################
rm(list=ls()) 
x=c("rgdal","rgeos","sp","geosphere","tidyr","broom","ggplot2")

lapply(x, library, character.only=TRUE) # load the required packages
library(dplyr)
library(starpolishr)
library(data.table)
library(haven)


setwd("Derived Data")
# plant by year file 
df=read_dta("plant_by_year_withloc.dta")

# drop if lat /lon info but no production info
df=df[df$X_merge!="using only (2)",]

# keep if fuel is coal
df=df[df$fuel=="coal",] 

# drop if AY =0 no production for that year 
df=df[df$AY_prod>0,]

#####################################################################
# subset out plants with lat /lon and those without:
dfmatched=df[df$X_merge=="matched",] # these plants have lat/lon

df_nomatch=df[df$X_merge=="master only",] # these plants do not have lat /lon from 2012

# cleaning: 
rm(df)
df_nomatch$Latitude=NULL; df_nomatch$Longitude=NULL

############################### Import coal plant closings file to get lat/lon ################ 
df_close=read.csv("../Raw Data/5. Electricity Data/eia_retirementlist_2019.csv", stringsAsFactors = FALSE, header=TRUE) # all plants that retired in 2019 or earlier

df_close=unique(df_close[,c("Plant.ID","Plant.State","Latitude","Longitude")]) # just get the lat/lon coordinates
# Plant State/Id are unique identifiers
# rename
df_close$PlantID=df_close$Plant.ID; df_close$Plant.ID=NULL
df_close$State=df_close$Plant.State; df_close$Plant.State=NULL

# merge in lat lon
dfmatchclose=merge(df_nomatch,df_close,by=c("PlantID","State"),all.x=TRUE) # checked

# Manually fill in for some missing

# Plant ID: 54243 = Brown Williamson Tobacco Coal Plant GA USA: http://globalenergyobservatory.org/geoid/217
dfmatchclose$Latitude[dfmatchclose$PlantID==54243 & dfmatchclose$State=="GA"]=32.8011; dfmatchclose$Longitude[dfmatchclose$PlantID==54243 & dfmatchclose$State=="GA"]=-83.693

# Plant ID: 62319  https://www.westernsugar.com/who-we-are/our-facilities/billings-mt-manufacturing-facility/
dfmatchclose$Latitude[dfmatchclose$PlantID==62319 & dfmatchclose$State=="MT"]=45.768820; dfmatchclose$Longitude[dfmatchclose$PlantID==62319 & dfmatchclose$State=="MT"]=-108.497260


# append 

dfcoalprod=rbind(dfmatched,dfmatchclose); rm(dfmatched, dfmatchclose, df_nomatch, df_close)

# just get the plants: 

dfcoal=unique(dfcoalprod[,c("PlantID","State","Latitude","Longitude")])
dfcoal=dfcoal[!is.na(dfcoal$Latitude),] # dataframe of all producing plants with a geocode
#checked
# dfcoalprod: plant production by year
# dfcoal: plant locations


###################################### Match to district ############## #######################################################

# convert coal df to spatial points 

coordinates(dfcoal)=cbind(dfcoal$Longitude,dfcoal$Latitude) # set the coordinates 
proj4string(dfcoal)=CRS("+proj=longlat +ellps=WGS84
+datum=WGS84 +no_defs +towgs84=0,0,0")

#### Import school district
setwd("../Raw Data/6. SEDA DATA")

sch_dist=readOGR(dsn="seda_shapefiles_2019_4.0", layer="seda_shapefiles_2019_4")

# only keep necessary columns:
sch_dist@data=sch_dist@data[, c("NAME","GEOID")]


sch_dist@data$leaid=as.numeric(sch_dist@data$GEOID) # checked

# create centroid of school district
centroid_sch=gCentroid(sch_dist, byid=TRUE) # CHECKED

# create spdf with centroid:

schdist_centr=SpatialPointsDataFrame(centroid_sch,data=sch_dist@data) # checked

schdist_centr=spTransform(schdist_centr, proj4string(dfcoal))

rm(centroid_sch)

dfout=lapply(1:nrow(dfcoal@data), function(ind){
  
  pt=dfcoal[ind,]
  
  distvec=distGeo(pt,schdist_centr)/1000 # distance from the coal plant to the school district
  
  
  
  distradius=which(distvec<100) 
  if(length(distradius)>0){
    schdist_radius=schdist_centr[distradius,]
    
    schdist_radius$distveckm=distGeo(pt, schdist_radius)/1000  # this is distance vec in km
    
    # create mini dataframe for this coal plant
    ptrep=pt[rep(seq_len(nrow(pt)), each = nrow(schdist_radius@data)), ] 
    
    # final dataframe
    dfout=data.frame(ptrep@data, 
                     schdist_radius@data)
    
    
    return(dfout)
  }
  
  
})


plant_dist=do.call(rbind,dfout)


##################################################
# final merge production with district info
# coalprod: plant x year production; # plant_dist: stacked plant_district match
prod_dist_year=merge(dfcoalprod,plant_dist,by=c("PlantID","State")) # checked 
# CLEANING
rm(dfcoal,dfcoalprod,dfout,schdist_centr, plant_dist)


prod_dist_year=prod_dist_year[,c("leaid","year","AY_prod","distveckm","PlantID","State")]

labelsvec=c("0","20","40","60","80")

breaksvec=c(0,20,40,60,80,100)
prod_dist_year$distcat=cut(prod_dist_year$distveckm, breaks=breaksvec,labels=labelsvec)

dfleaid_prod=prod_dist_year %>% group_by(leaid,year,distcat) %>% summarise(ayprodleaid=sum(AY_prod)) %>% ungroup()
####################### Fill in database: dist by bin by year 
yearvec=c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018)
distbinvec=as.numeric(labelsvec)
schdistlist=lapply(1:nrow(sch_dist@data), function(ind){
  
  pt=sch_dist@data[ind,]
  
  year=rep(yearvec, each=length(distbinvec))
  
  distcat=rep(distbinvec,length(yearvec))
  
  leaid=rep(pt$leaid,length(yearvec)*length(distbinvec))
  
  dfout=data.frame(leaid,year,distcat)
  
  
})

leaid_to_fill=do.call(rbind,schdistlist) 

# merge in dfleaid_prod:
dfleaid_prod=merge(leaid_to_fill,dfleaid_prod,by=c("leaid","year","distcat"),all.x=TRUE)


# if NA: zero prod
dfleaid_prod$ayprodleaid[is.na(dfleaid_prod$ayprodleaid)]=0  


dfleaid_prod$ayprodscale=dfleaid_prod$ayprodleaid/1000000; dfleaid_prod$ayprodleaid=NULL



#clean: 
rm(sch_dist, schdistlist,distbinvec,yearvec)
###################################################################


# Reshape the data frame
dfleaid=reshape(dfleaid_prod, idvar = c("leaid","year"), timevar = "distcat", direction = "wide")


################# merge in test scores #################### 
setwd("../../Derived Data")
library(lfe)
library(stargazer)


# Variable lists: 

###################################################
tempdf=read.csv("weatherdf.csv",header=TRUE,stringsAsFactors = FALSE)

mergetest_stack <- function(subject,dfleaid_in,tempdf_in,leaidkeep){

  if(subject=="mth"){
    df_test=read.csv("df_mth_noinst.csv", stringsAsFactors = FALSE, header=TRUE)

    filewritename=paste(subject,"_models_mth.tex",sep="")

  }
  else if(subject=="rla"){
    df_test=read.csv("df_rla_noinst.csv", stringsAsFactors = FALSE, header=TRUE)
    filewritename=paste(subject,"_models_rla.tex",sep="")

  }
  else{
    df_test=read.csv("df_avg_noinst.csv", stringsAsFactors = FALSE, header=TRUE)
    filewritename=paste(subject,"_models_avg.tex",sep="")

  }

  

  dfout=merge(df_test,dfleaid_in, by=c("leaid","year"),all.x=TRUE) # merge is perfect



  ##################################################################

  dfout=merge(dfout,tempdf_in,by=c("leaid","year"),all.x=TRUE)

 
}


df_avg=mergetest_stack("avg",dfleaid,tempdf,leaidkeep) 

# get number of unique districts
length(unique(df_avg$leaid))

fwrite(df_avg,"df_avg_coalplantinst.csv")

dfout=df_avg

dfout=dfout %>% drop_na( cs_mn_all_lag_cohort)
nrow(dfout)
length(unique(dfout$leaid))

dfout=dfout %>% drop_na( malpct_test,perecd,perblk , perhsp , malpct_test , pcttested, totenrl , perell,perspeced)
dfout$state_year=paste(dfout$stateabb,dfout$year,sep="_")
dfout$X=NULL
nrow(dfout)
length(unique(dfout$leaid))


# merge acs:
acsdf=read.csv("acs2009_2018.csv",stringsAsFactors = FALSE, header=TRUE)
acsdf$pctemploy=as.numeric(acsdf$pctemploy); acsdf$pctlaborforce=as.numeric(acsdf$pctlaborforce); acsdf$pctutility=as.numeric(acsdf$pctutility); acsdf$pctmanufac=as.numeric(acsdf$pctmanufac)
acsdf$leaid=acsdf$LEAID; acsdf$LEAID=NULL
acsdf$X=NULL
dfout=merge(dfout,acsdf,by=c("leaid","year"), all.x=TRUE)


distanceprod="ayprodscale.0 + ayprodscale.20"
econvar="pctemploy + pctmanufac + single_momall + baplusall + pctutility + pctlaborforce"

demogvar="perfrl + perblk + perhsp + perwht  + malpct_test + pcttested+ totenrl +  perell + perspeced "
sorting="moveinrate + moveoutrate"
tempvarsavg="ay_mean_snow  +aymean_high_num_snow+aymean_severe_num_snow+ay_avg_mean_max + ay_avg_mean_min + ay_avg_bint80  + ay_avg_bint90  + ay_avg_bint100 + ay_avg_bint20 + ay_avg_bint10+ ay_avg_bint0+ ay_mean_prcp + aymean_high_num_prcp+aymean_severe_num_prcp"

# Coal Plant IV ####################### 
model_iv_none=as.formula(paste("cs_mn_all~ cs_mn_all_lag_cohort  |factor(year) + factor(grade)+factor(leaid)|(avgPM25_9  ~ ", distanceprod ,")|leaid",sep="" ))
result_iv_none=felm(model_iv_none,dfout[dfout$testmonth=="May" ,],na.action=na.omit)


model_iv_demog=as.formula(paste("cs_mn_all~ cs_mn_all_lag_cohort + ",demogvar," |factor(year) + factor(grade)+factor(leaid)|(avgPM25_9  ~ ", distanceprod ,")|leaid",sep="" ))
result_iv_demog=felm(model_iv_demog,dfout[dfout$testmonth=="May" ,],na.action=na.omit)
result_iv_demog$stage1$iv1fstat$avgPM25_9


model_iv_demog_sort=as.formula(paste("cs_mn_all~ cs_mn_all_lag_cohort + ",demogvar," + ", econvar, " |factor(year)  + factor(grade)+factor(leaid)|(avgPM25_9  ~ ", distanceprod ,")|leaid",sep="" ))

result_iv_demog_sort=felm(model_iv_demog_sort,dfout[dfout$testmonth=="May" ,],na.action=na.omit)
result_iv_demog_sort$stage1$iv1fstat$avgPM25_9

model_iv_demogext=as.formula(paste("cs_mn_all~ cs_mn_all_lag_cohort + ", demogvar, "  + ", tempvarsavg ," + ", econvar, " |factor(year) + factor(grade)+factor(leaid)|(avgPM25_9 ~ ", distanceprod, ")|leaid",sep="" ))


result_iv_demogext=felm(model_iv_demogext,dfout[dfout$testmonth=="May" ,],na.action=na.omit)
result_iv_demogext$stage1$iv1fstat$avgPM25_9


ivstar=stargazer(result_iv_demog,result_iv_demog_sort,result_iv_demogext, keep="avgPM25_9", digits=4,out="../output/duquegilraine40km.tex")
